#   LibAut
#       outcome regressions: multiple, leave-pair-out CV
#       LPO AUC for Table 1 column (model) 4, and Table C.5
#   Juraj Medzihorsky
#   2021-08-11

library(logistf)
library(parallel)
options(mc.cores=14) # written for a 16-thread linux machine
options(stringsAsFactors=F)
source('helpers.R')

#   sessionInfo()
##  R version 4.0.4 (2021-02-15)
##  Platform: x86_64-pc-linux-gnu (64-bit)
##  Running under: Ubuntu 20.04.2 LTS
##
##  Matrix products: default
##  BLAS/LAPACK: /opt/OpenBLAS/lib/libopenblas_haswellp-r0.3.13.so
##
##  locale:
##   [1] LC_CTYPE=C.UTF-8           LC_NUMERIC=C
##   [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C.UTF-8
##   [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C.UTF-8
##   [7] LC_PAPER=C.UTF-8           LC_NAME=C
##   [9] LC_ADDRESS=C               LC_TELEPHONE=C
##  [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
##  attached base packages:
##  [1] parallel  stats     graphics  grDevices utils     datasets  methods
##  [8] base
##
##  other attached packages:
##  [1] logistf_1.24   nvimcom_0.9-92
##
##  loaded via a namespace (and not attached):
##   [1] Rcpp_1.0.6           magrittr_2.0.1       splines_4.0.4
##   [4] tidyselect_1.1.0     lattice_0.20-41      R6_2.5.0
##   [7] rlang_0.4.10         dplyr_1.0.4          tools_4.0.4
##  [10] grid_4.0.4           broom_0.7.5          nlme_3.1-152
##  [13] mgcv_1.8-34          DBI_1.1.1            ellipsis_0.3.1
##  [16] formula.tools_1.7.1  assertthat_0.2.1     tibble_3.0.6
##  [19] lifecycle_1.0.0      crayon_1.4.1         Matrix_1.3-2
##  [22] tidyr_1.1.2          purrr_0.3.4          operator.tools_1.6.3
##  [25] vctrs_0.3.6          glue_1.4.2           mice_3.13.0
##  [28] compiler_4.0.4       pillar_1.4.7         backports_1.2.1
##  [31] generics_0.1.0       pkgconfig_2.0.3

#------------------------------------------------------------------------------

code_dir <- getwd()
data_dir <- gsub('code$', 'data', code_dir)

setwd(data_dir)
dat <- readRDS('outcome_multiple_20210810.rds')
setwd(code_dir)

dat$reg_fac <- as.factor(dat$e_regionpol_6C)
dat <- subset(dat, !(dem_ep_id%in%'HRV_1992_2004'))

#------------------------------------------------------------------------------
#   models

#   formulas
fl0 <- fy0 <- epy ~ 1
linchunk1 <- ~ . + 
    reg_fac +
    lag1_e_migdppcln + 
    lag1_e_migdpgro + 
    lag1_logpop +
    lag1_v2pepwrsoc + 
    lag1_v2xeg_eqdr + 
    lag1_v2xnp_pres + 
    lag1_v2csprtcpt +
    lag1_v2x_polyarchy +
    lag1_excl_region_edi + 
    lag1_e_miinteco + 
    lag1_e_miinterc  
linchunk2 <- ~ . + 
    I(1e-1*(year-1990)) + I(1e-1*(year-1990)^2) + I(1e-1*(year-1990)^3) 
fl1 <- update(fy0, linchunk1)
fl2 <- update(fl1, linchunk2)


#   gaussian-linear model under MLE for one left-out pair
onemlegau <-
    function(ii, dat, form)
    {
        ii <- unlist(ii)    
        m <- lm(form, data=dat[-ii,])
        p <- predict(m, newdata=dat[ii,], type='response')
        auc <- as.numeric(p[1]<p[2])
        cc <- mean(round(p)==dat[ii,'epy'])
        return(data.frame(auc=auc, cc=cc))
    }


#   bernoulli-logistic model under MLE for one left-out pair
onemlelgt <-
    function(ii, dat, form)
    {
        ii <- unlist(ii)    
        m <- glm(form, data=dat[-ii,], family=binomial('logit'))
        p <- predict(m, newdata=dat[ii,], type='response')
        auc <- as.numeric(p[1]<p[2])
        cc <- mean(round(p)==dat[ii,'epy'])
        return(data.frame(auc=auc, cc=cc))
    }


#   bernoulli-logistic model under FPL for one left-out pair
onelpo <-
    function(ii, dat, nd, form,
             mcont=logistf.control(maxit=1e4, maxstep=1e4))
    {
        ii <- unlist(ii)
        m <- logistf(form, data=dat[-ii,], control=mcont)
        lp <- nd[ii,] %*% matrix(coef(m), ncol=1)
        p <- plogis(lp)
        auc <- as.numeric(p[1]<p[2])
        cc <- mean(round(p)==dat[ii,'epy'])
        return(data.frame(auc=auc, cc=cc))
    }


#------------------------------------------------------------------------------

#   grid of all possible 0-1 pairs
gp <- expand.grid(y0=which(dat$epy%in%0),
                  y1=which(dat$epy%in%1))

#   model matrices for FPL
mdat1 <- model.matrix(fl1, dat)
mdat2 <- model.matrix(fl2, dat)

#------------------------------------------------------------------------------
#   execute

rr <- 1:nrow(gp)

system.time(mle_gau_1 <- do.call(rbind, mclapply(rr, function(j) onemlegau(gp[j,], dat=dat, form=fl1))))
system.time(mle_gau_2 <- do.call(rbind, mclapply(rr, function(j) onemlegau(gp[j,], dat=dat, form=fl2))))
system.time(mle_lgt_1 <- do.call(rbind, mclapply(rr, function(j) onemlelgt(gp[j,], dat=dat, form=fl1))))
system.time(mle_lgt_2 <- do.call(rbind, mclapply(rr, function(j) onemlelgt(gp[j,], dat=dat, form=fl2))))
system.time(fpl_lgt_1 <- do.call(rbind, mclapply(rr, function(j) onelpo(gp[j,], dat=dat, nd=mdat1, form=fl1))))
system.time(fpl_lgt_2 <- do.call(rbind, mclapply(rr, function(j) onelpo(gp[j,], dat=dat, nd=mdat2, form=fl2))))

#------------------------------------------------------------------------------
#   save

setwd(data_dir)
save(list=c('mle_gau_1', 'mle_gau_2',
            'mle_lgt_1', 'mle_lgt_2', 
            'fpl_lgt_1', 'fpl_lgt_2'), 
     file='outcome_regressions_multiple_lpo_20210811.rdata')

#   round(rbind(
#         cbind(mean(mle_gau_1$auc), mean(mle_gau_2$auc),
#               mean(mle_lgt_1$auc), mean(mle_lgt_2$auc),
#               mean(fpl_lgt_1$auc), mean(fpl_lgt_2$auc)),
#         cbind(mean(mle_gau_1$cc), mean(mle_gau_2$cc),
#               mean(mle_lgt_1$cc), mean(mle_lgt_2$cc),
#               mean(fpl_lgt_1$cc), mean(fpl_lgt_2$cc))),
#         2)
#   first row, column 6 goes into Table 1 column (model) 4   
#   the first row goes into Table C.5 
#   # 0.86 0.88 0.86 0.88 0.86 0.87
#   # 0.77 0.78 0.78 0.77 0.77 0.77

#   SCRIPT END